library(pROC)
library(ggpubr)
library(Biostrings)
## Warning: package 'BiocGenerics' was built under R version 4.0.5
library(data.table)
library(dplyr)
library(PepTools)
library(cowplot)

library(rstatix)

library(purrr)
library(tidyverse)
library(yardstick)
library(doParallel)
library(foreach)
library(stringdist)
library(caret)
library(elucidate)

0.1 Introduction

0.2 useful Functions

# Function to provide a closest match. Used to match HLA Alleles across mixed output styles.
ClosestMatch2 = function(string, stringVector){

  stringVector[amatch(string, stringVector, maxDist=Inf)]

}

1 Pathogenic vs cancer peptides

1.1 Prepare TESLA dataset

  • TESLA dataset comes with nM binding affinity from netMHCpan 4.0 and hrs stability from netMHCstabpan.
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
TESLA= fread("TESLA_DATASET_608.csv")

TESLA=TESLA %>% separate(col=PMHC, into = c("HLA_Allele","Peptide"),sep="_")%>% mutate(HLA_Allele = paste0("HLA-",HLA_Allele))%>%
  mutate(Length = nchar(Peptide))%>% mutate(VALIDATED = ifelse(VALIDATED==FALSE,"Negative","Positive"))%>% dplyr::rename(Immunogenicity=VALIDATED)

TESLA=TESLA %>% mutate(nM = NETMHC_PAN_BINDING_AFFINITY) %>% mutate("Thalf(h)" = BINDING_STABILITY) %>% mutate(HydroFraction = FRAC_HYDROPHOBIC)

TESLA %>% glimpse()
## Rows: 608
## Columns: 26
## $ HLA_Allele                  <chr> "HLA-A*01:01", "HLA-A*01:01", "HLA-A*01:01…
## $ Peptide                     <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PATIENT_ID                  <int> 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, …
## $ TISSUE_TYPE                 <chr> "PBMC", "PBMC", "PBMC", "PBMC", "PBMC", "P…
## $ MHC                         <chr> "A*01:01", "A*01:01", "A*01:01", "A*01:01"…
## $ ALT_EPI_SEQ                 <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", …
## $ PEP_LEN                     <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ MEASURED_BINDING_AFFINITY   <dbl> 45, 33, 77, 0, 3, 1320, NA, 354, 30, 9, NA…
## $ NETMHC_PAN_BINDING_AFFINITY <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ TUMOR_ABUNDANCE             <dbl> 57.819051, 51.857250, 30.389590, 159.96769…
## $ BINDING_STABILITY           <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ FRAC_HYDROPHOBIC            <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…
## $ AGRETOPICITY                <dbl> 0.019019295, 0.838764951, 0.010851108, 1.2…
## $ FOREIGNNESS                 <dbl> 0.000000e+00, 0.000000e+00, 9.280000e-20, …
## $ MUTATION_POSITION           <int> 2, 3, 11, 5, 8, 10, 9, 8, 7, 1, 2, 1, 1, 7…
## $ NUMBER_PREDICTING           <int> 11, 10, 8, 13, 13, 8, 5, 12, 6, 8, 6, 16, …
## $ Immunogenicity              <chr> "Negative", "Negative", "Negative", "Negat…
## $ TCR_FLOW_I                  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_I_QUANT            <dbl> 0.0000, 0.0000, 0.0011, 0.0000, 0.0000, 0.…
## $ TCR_NANOPARTICLE            <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ TCR_FLOW_II                 <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ TCR_FLOW_II_QUANT           <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, …
## $ Length                      <int> 9, 10, 11, 9, 9, 10, 9, 9, 8, 9, 8, 9, 10,…
## $ nM                          <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.…
## $ `Thalf(h)`                  <dbl> 0.76, 2.20, 2.24, 2.28, 1.21, 2.80, 1.82, …
## $ HydroFraction               <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667…

1.1.1 Generate TESLA data binding affinity rank scores

  • Use netMHCpan to get the rank score.

TESLA$HLA_Allele = gsub(x=TESLA$HLA_Allele,pattern="\\*",replacement = "")

TEST_DATA_LOCATION = "TESLA_NETMHC/"
for(allele_i in 1:length(unique(TESLA$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(TESLA$HLA_Allele)[allele_i],pattern="\\:|\\*",replacement = "")


    LENGTHS=TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
    write.table(TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(TESLA$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))

}

1.1.2 Read in binding affinity rank scores

# Read from folder.
TEST_DATA_LOCATION = "TESLA_NETMHC/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# map HLA allele nomenclature
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))

Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(TESLA$HLA_Allele))
# Confirm 608 rows before and after joining with full TESLA data
Netmhcpanres %>% nrow
## [1] 608
TESLA %>% nrow
## [1] 608
# Join dataset
TESLA=TESLA %>% inner_join(Netmhcpanres %>% select(Peptide, HLA_Allele, Rank) %>% distinct())
## Joining, by = c("HLA_Allele", "Peptide")
TESLA %>% nrow
## [1] 608

1.1.3 Run netMHCstabpan to generate binding stability rank scores

dir.create("TESLA_STABPAN")
TEST_DATA_LOCATION = "TESLA_STABPAN/"
TESLA$HLA_Allele = gsub(x=TESLA$HLA_Allele,pattern="\\*",replacement = "")

#TESLA=TESLA %>% #add star and colon
 # mutate(HLA_Allele = gsub('^(.{8})(.*)$', '\\1:\\2', gsub('^(.{5})(.*)$', '\\1*\\2', HLA_Allele)))


for(allele_i in 1:length(unique(TESLA$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(TESLA$HLA_Allele)[allele_i],pattern=":",replacement = "")
    DATASET = TESLA %>% filter(HLA_Allele %in% unique(TESLA$HLA_Allele)[allele_i])
    for(i in 1:length(unique(DATASET$Length))){
        LENGTH = unique(DATASET$Length)[i]
        testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
        write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
    # Run model
        RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
        system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(TESLA$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
    }
}

1.1.4 Read in binding stability rank scores from netMHCstabpan

TEST_DATA_LOCATION = "TESLA_STABPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(TESLA$HLA_Allele))

Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele, Rank) %>% dplyr::rename(StabRank=Rank)

TESLA=TESLA %>% select(!"Thalf(h)") %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))%>% distinct()
TESLA %>% nrow
## [1] 608

1.2 Read in pathogenic peptides data

# Read in RDS file of MHCI Human peptide immunogenicity dataset.
FullDataset= readRDS("MHCI_Human_StricterCriteria_DiffFeatures_pathogenics.rds")

1.3 Prepare pathogenic dataset

1.3.1 Criteria

  • Only one peptide,imm,hla observation
  • Excluded any sequences composed of some non amino acid characters sometimes found in these datasets
  • Filter for peptides of length 8-14 (same as TESLA).
  • Do a little cleaning
  • Filter for specific alleles i,.e those which stabpan can process from HLAs A,B,C
  • After applying these filters there are peptides from a vast array of HLAs
# Seperate the rows for one row per Peptide, Immunogenicity, HLA observation
FullDataset=FullDataset %>% separate_rows(HLA_Allele, sep="\\|")
FullDataset=FullDataset %>% select(!c(MHCType,Sources))

FullDataset %>% select(Peptide, Immunogenicity, HLA_Allele) %>% dupes()
## No column names specified - using all columns.
## No duplicates detected.
## # A tibble: 0 × 4
## # … with 4 variables: Peptide <chr>, Immunogenicity <chr>, HLA_Allele <chr>,
## #   n_copies <int>
# Perform some cleaning
FullDataset$HLA_Allele=gsub(FullDataset$HLA_Allele,pattern="\\*",replacement="")
# Exclude unusable peptides.
FullDataset=FullDataset %>% filter(!Peptide %in%  grep(FullDataset$Peptide,pattern="X|l|-",value=T))
# This filter had already been applied, but double check no peptides originating from humans, i.e., no neoantigens.
FullDataset=FullDataset %>% filter(!Antigen_Organism %in% grep("Homo sapiens|cancer",Antigen_Organism,value=TRUE,ignore.case = T))
#only 8-14mers
FullDataset=FullDataset %>% filter(nchar(Peptide) %in% 8:14)

FullDataset$HLA_Allele = gsub(",",FullDataset$HLA_Allele,replacement = "")
FullDataset$Antigen_Name = gsub(",",FullDataset$Antigen_Name,replacement = "")
FullDataset$Antigen_Organism = gsub(",",FullDataset$Antigen_Organism,replacement = "")
FullDataset$Host_Organism= gsub(",",FullDataset$Host_Organism,replacement = "")
FullDataset=FullDataset  %>% mutate(Length = nchar(Peptide))

# Filter for stabpan alleles.
ALLELES=fread("../../STABPAN_MHC_allele_names.txt",header=F)
FullDataset=FullDataset %>% filter(HLA_Allele %in% grep("HLA-A|HLA-B|HLA-C", HLA_Allele,value = T))
FullDataset=FullDataset %>%  filter(HLA_Allele %in% ALLELES$V1)

FullDataset=FullDataset %>% distinct(Peptide,Immunogenicity,HLA_Allele,.keep_all = T)


FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive 
##    26121     8530
FullDataset %>% select(Immunogenicity) %>% table%>% prop.table()
## .
##  Negative  Positive 
## 0.7538311 0.2461689
FullDataset %>% nrow
## [1] 34651

1.3.2 Run netMHCpan 4.0 for BA rank/nM

  • Generate nM and rank binding affinity scores

TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
    write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))

}

1.3.3 Read in netMHCpan results

TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_NETMHCPAN/"
data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 34650

1.3.4 Filter only for binder to MHC for susbequent analysis

  • Exclude observations which are predicted to be non-binders
FullDataset=FullDataset %>% filter(Binder ==  'BINDER')
FullDataset %>% nrow
## [1] 23958
FullDataset %>% select(Immunogenicity) %>% table
## .
## Negative Positive 
##    17664     6294
FullDataset %>% select(Immunogenicity) %>% table %>% prop.table()
## .
##  Negative  Positive 
## 0.7372903 0.2627097

1.3.5 Run netmhc stabpan

  • Generate a binding stability prediction

TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
    for(i in 1:length(unique(DATASET$Length))){
        LENGTH = unique(DATASET$Length)[i]
        testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
        write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
    # Run model
        RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
        system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
    }
}

1.3.6 Read in predictions from stab pan

  • Read in rank and hrs stability score.
TEST_DATA_LOCATION = "PATHOGENIC_EPITOPES_STABPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
# Extract HLA and clean
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele, Rank) %>% dplyr::rename(StabRank=Rank)

FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide","HLA_Allele"))%>% distinct()
# 23958 obs so joining worked correctly.
FullDataset %>% nrow
## [1] 23958

1.3.7 Compute hydrophobic fraction

library(Peptides)

library(stringi)

HYDROPHOBIC_RESIDUES = c("V","I","L","F","M","W","C") # from BARNES 2003 (SEE WELLS PAPER)

FullDataset=FullDataset %>% mutate(HydrophobicCount =  stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)

1.3.8 Finalise TESLA vs Pathogenic datasets for analysis

TESLA = TESLA %>% mutate(Dataset = "Cancer")
FullDataset = FullDataset%>% mutate(Dataset = "Pathogenic")

# Take necessary columns of data
TESLASubset=TESLA %>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset, Rank, StabRank)
PathogenicSubset=FullDataset%>% select(Peptide,Immunogenicity, HLA_Allele, nM, "Thalf(h)", HydroFraction, Dataset, Rank,StabRank)
# Combine into one DT for analysis
combinedDataset = rbind(TESLASubset,PathogenicSubset)
# Look at number of observations
combinedDataset %>% select(Dataset,Immunogenicity)%>% table
##             Immunogenicity
## Dataset      Negative Positive
##   Cancer          571       37
##   Pathogenic    17664     6294
combinedDataset =combinedDataset%>% mutate(Dataset = factor(Dataset, levels = c("Pathogenic","Cancer")))

1.4 Results

1.4.1 Affinity comparison between TESLA vs pathogenic data

  • Compare binding affinity in nM for TESLA vs Pathogenic peptides
  • Generates Fig 2A
# Generate a density plot
BA_PATHTESLA_DENSITY_PLT= combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")

mycomparisons =list(c("Positive","Negative"))

# Take the median of the dataset for TESLA-Nve and TESLA-Pve to plot dashed lines.
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))

# Generate a violin plot, comparing immunogenicity status for 1) TESLA and 2) Pathogenic peptides.
BA_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

# Generate a boxplot
BA_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)

# ECDF plot. Split data first into four groups: pathogenic pve/nve, and cancer pve/nve.
BA_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(nM, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(nM, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=nM, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(nM)")

1.4.1.1 Fig 2A

plot_grid(BA_PATHTESLA_DENSITY_PLT, BA_PATHTESLA_BOX_PLT+theme(legend.position = "none"),BA_PATHTESLA_VIOLIN_PLT, BA_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.1.2 Can we perform HLA-specific analysis?

  • Far too few peptides per HLA for TESLA, even for HLA A0201.
combinedDataset  %>% group_by(Dataset, Immunogenicity, HLA_Allele)%>% dplyr::summarise(n=n()) %>% slice_max(n=10,order_by = n)%>% DT::datatable()
## `summarise()` has grouped output by 'Dataset', 'Immunogenicity'. You can override using the `.groups` argument.

1.4.1.3 Confirm results are consistent with for Rank score

  • HLA ligands bind different MHC in different nM ranges.
  • We cannot exmaine these effects in a HLA-specific manner, but we can use the rank score. According to the authors of netMHCpan: “This measure is not affected by inherent bias of certain molecules towards higher or lower mean predicted affinities.”
  • Here, we observe the same pattern as with nM, but more mildly.
RANK_PATHTESLA_DENSITY_PLT= combinedDataset  %>% ggplot(aes(x=Rank, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(Rank)")

mycomparisons =list(c("Positive","Negative"))

MEDIAN_DATA_TESLA_NEG = combinedDataset  %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(Rank))
MEDIAN_DATA_TESLA_POS = combinedDataset  %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(Rank))


RANK_PATHTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Rank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Binding Affinity\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

RANK_PATHTESLA_BOX_PLT=combinedDataset  %>% ggplot(aes(x=Immunogenicity, y=Rank, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(Rank)")+theme_pubr(base_size = 16)

RANK_PATHTESLA_ECDF_PLT=PathogenicSubset%>% select(Rank, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset%>% select(Rank, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=Rank, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(Rank)")
plot_grid(RANK_PATHTESLA_DENSITY_PLT, RANK_PATHTESLA_BOX_PLT+theme(legend.position = "none"),RANK_PATHTESLA_VIOLIN_PLT, RANK_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.2 Binding Stability

1.4.2.1 Hours

  • Examine binding stability (hrs) between groups.
# Density plot. Comparing cancer vs pathogens, grouped by immunogenicity status.
STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))
# Violin plot. Comparing immunogenicity status for pathogens and also for cancer.
STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>%
        ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
# Boxplot.
STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 16)
# ECDF
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")

1.4.2.2 Fig 2B

library(cowplot)

plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.2.3 Rank score.

STAB_PATHTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=StabRank, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(Rank)")

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(StabRank))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(StabRank))


STAB_PATHTESLA_VIOLIN_PLT=combinedDataset %>%
        ggviolin(x="Immunogenicity",y="StabRank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

STAB_PATHTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=StabRank, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+theme_pubr(base_size = 16)
STAB_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(StabRank, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(StabRank, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=StabRank, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(Rank)")
library(cowplot)

plot_grid(STAB_PATHTESLA_DENSITY_PLT, STAB_PATHTESLA_BOX_PLT+theme(legend.position = "none"),STAB_PATHTESLA_VIOLIN_PLT, STAB_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.3 Compare the fraction of hydrophobicity between groups

  • Compare the fraction of hydrophobicity between groups
HYDRO_PATHTESLA_DENSITY_PLT= combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% ggplot(aes(x=HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic")

MEDIAN_DATA_TESLA_NEG = combinedDataset%>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))

HYDRO_PATHTESLA_VIOLIN_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

HYDRO_PATHTESLA_BOX_PLT=combinedDataset %>% select(Peptide,Immunogenicity,Dataset,HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic\n ")+theme_pubr(base_size = 16)

HYDRO_PATHTESLA_ECDF_PLT=PathogenicSubset %>% select(HydroFraction, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>% distinct()%>%rbind(
  TESLASubset %>% select(HydroFraction, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")%>% distinct()
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic")

1.4.3.1 Fig 3C

plot_grid(HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),HYDRO_PATHTESLA_VIOLIN_PLT, HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.4 Examine fraction hydrophobicity in TCR contact residues

  • Reviewer’s comment : “The authors should check the hydrophobicity of amino acids in the non-anchor regions as well, because anchors determine HLA binding only, while the T cell response is mainly associated with non-anchor amino acids.”
  • To reduce confounding factors of this analysis, we only examine 9 and 10mers
  • We adopt the approach of Koncz et al, 2021 PNAS:
  • TCR contact residues for 9mers and positions 4-8
  • TCR contact residues for 10mers are positions 5-9

1.4.4.1 Filter 9/10mers

Hydro_frac_tcr_contact_dt = combinedDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length %in% c(9,10))

Hydro_frac_tcr_contact_dt %>% select(Dataset, Immunogenicity) %>% table
##             Immunogenicity
## Dataset      Negative Positive
##   Pathogenic    17056     5820
##   Cancer          487       35
# Seperate these data to apply different strategies to different lengths.
Hydro_frac_tcr_contact_dt_9 = combinedDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length %in% c(9))
Hydro_frac_tcr_contact_dt_10 = combinedDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length %in% c(10))
# 9mers: take positions 4-8
Hydro_frac_tcr_contact_dt_9=Hydro_frac_tcr_contact_dt_9%>% mutate(TCRContact_Peptide = substr(Peptide, start=4, stop=8))
# 10mers: take positions 5-9
Hydro_frac_tcr_contact_dt_10=Hydro_frac_tcr_contact_dt_10%>% mutate(TCRContact_Peptide = substr(Peptide, start=5, stop=9))
# Combine these data
Hydro_frac_tcr_contact_dt = rbind(Hydro_frac_tcr_contact_dt_9,Hydro_frac_tcr_contact_dt_10)
# compute hydro fraction of the 'TCR Contact Peptide'
## Get the length
Hydro_frac_tcr_contact_dt=Hydro_frac_tcr_contact_dt %>% mutate(TCRContact_Length = nchar(TCRContact_Peptide))
## Count the number of hydrophobic residues in the 'TCR contact peptide'. Then divide by the length of the TCR contact peptider (5).
Hydro_frac_tcr_contact_dt=Hydro_frac_tcr_contact_dt %>% mutate(HydrophobicCount =  stri_count_regex(Hydro_frac_tcr_contact_dt$TCRContact_Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(TCR_HydroFraction = HydrophobicCount/TCRContact_Length)

1.4.5 Visualise these data

# Generate density plot
TCR_HYDRO_PATHTESLA_DENSITY_PLT= Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% ggplot(aes(x=TCR_HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("Fraction Hydrophobic (TCR Contact)")

MEDIAN_DATA_TESLA_NEG = Hydro_frac_tcr_contact_dt%>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% filter(Dataset == 'Cancer', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(TCR_HydroFraction))
MEDIAN_DATA_TESLA_POS = Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% filter(Dataset == 'Cancer', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(TCR_HydroFraction))
# Generate violin plot
TCR_HYDRO_PATHTESLA_VIOLIN_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="TCR_HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("Fraction Hydrophobic (TCR Contact)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)
# Generate box plot
TCR_HYDRO_PATHTESLA_BOX_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=TCR_HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic (TCR Contact)")+theme_pubr(base_size = 16)
# Generate ECDF
TCR_HYDRO_PATHTESLA_ECDF_PLT=Hydro_frac_tcr_contact_dt %>% select(TCR_HydroFraction, Immunogenicity, Peptide, Dataset)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=TCR_HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic (TCR Contact)")
plot_grid(TCR_HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), TCR_HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),TCR_HYDRO_PATHTESLA_VIOLIN_PLT,TCR_HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

1.4.6 HLA specific plots

  • While due to low sample number of cancer peptides, we cannot compare pathogens vs cancer, but we can examine the pathogenic peptides in a HLA specific manner.
  • Here, we filter for 5 common HLAs, and compare affinity in nM, stability in hours and fraction of hydrophobicity for immunogenic vs nonimmunogenic pathogenic peptides

1.4.6.1 Binding affinity nM

PathogenicSubset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% select(HLA_Allele,Immunogenicity)%>% table
##             Immunogenicity
## HLA_Allele   Negative Positive
##   HLA-A01:01      843      229
##   HLA-A02:01     1624     1906
##   HLA-B07:02      688      401
##   HLA-B40:01      783      122
##   HLA-C07:02       12       41
FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+facet_wrap(~HLA_Allele,nrow=3)+theme_pubr(base_size = 14)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity (nM)")

1.4.6.2 Binding stability hrs

FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+facet_wrap(~HLA_Allele,nrow=3)+theme_pubr(base_size = 14)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
    scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability (hrs)")

1.4.6.3 Fraction of hydrophobicity for the full length peptide.

FullDataset %>% filter(HLA_Allele %in% grep('A01:01|A02:01|B07:02|B40:01|C07:02',HLA_Allele,value = T))%>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 14)+facet_wrap(~HLA_Allele,nrow=3)+stat_compare_means(label = "p.signif",label.x.npc = "center", label.y = 0.9)+ylab("Fraction Hydrophobic")+ylim(0,1.0)

2 Compare GBM w/ TESLA

2.1 Prepare datasets for comparison

2.1.1 GBM

  • TESLA already ready. So read in GBM and compute binding affinity in both nM and rank, binding stability in hrs and rank and fraction hydrophobicity.
  • Read in GBM dataset
  • Class the contradictory peptide as positive.
  • Filter for 9 and 10mers
  • leaves 123 peptides in total. 20% are immunogenic
FullDataset = readRDS("GBM_test_data.rds")

2.1.1.1 Run netMHCpan for GBM peptides

  • Generate binding affinity estimate
TEST_DATA_LOCATION = "GBM_EPITOPES_NETMHCPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    LENGTHS=FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])%>% pull(Length) %>% unique
    testdata=paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
    write.table(FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i]) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
    RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
    system(paste0("/Applications/netMHCpan-4.0/netMHCpan -BA -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",paste0(LENGTHS,collapse = ",")," -xls -xlsfile ", RESULTS_OUTPUT))
}

2.1.1.2 Read in nM and rank scores for GBM

  • Binding affinity
TEST_DATA_LOCATION = "GBM_EPITOPES_NETMHCPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))
FullDataset=FullDataset %>% inner_join(Netmhcpanres %>% select(! c(file,Pos,ID,core,icore))) %>% mutate(Binder = ifelse(NB==1,"BINDER","NONBINDER"))
## Joining, by = c("Peptide", "HLA_Allele")
FullDataset %>% nrow
## [1] 123

2.1.1.3 Run netMHCstabpan for GBM peptides

  • generate stability predictions

TEST_DATA_LOCATION = "GBM_EPITOPES_STABPAN/"
for(allele_i in 1:length(unique(FullDataset$HLA_Allele))){
    HLA_ALLELE_FOR_TESTING = gsub(x=unique(FullDataset$HLA_Allele)[allele_i],pattern=":",replacement = "")
    DATASET = FullDataset %>% filter(HLA_Allele %in% unique(FullDataset$HLA_Allele)[allele_i])
    for(i in 1:length(unique(DATASET$Length))){
        LENGTH = unique(DATASET$Length)[i]
        testdata=paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC_data.txt")
        write.table(DATASET %>% filter(Length == LENGTH) %>% select(Peptide) %>% pull,file=testdata,sep="\n",col.names = F,row.names = F,quote=F)
# Run model
        RESULTS_OUTPUT = paste0(TEST_DATA_LOCATION,"LENGTH_",LENGTH,"_Allele_",HLA_ALLELE_FOR_TESTING,"_NetMHC","_Results.csv")
        system(paste0("/Applications/netMHCstabpan-1.0/netMHCstabpan -p ",testdata," -a ",unique(FullDataset$HLA_Allele)[allele_i]," -l ",LENGTH," -xls -xlsfile ", RESULTS_OUTPUT))
    }
}

2.1.1.4 Read in netMHCstabpan scores for GBM peptides

  • Read in binding stability in hrs and rank
TEST_DATA_LOCATION = "GBM_EPITOPES_STABPAN/"

data_path <- TEST_DATA_LOCATION
files <- dir(data_path, pattern = "NetMHC_Results.csv")

data3 <- data_frame(file = files) %>%
  mutate(file_contents = map(file,
                             ~ fread(file.path(data_path, .),skip = 1))
  )

Netmhcpanres <- unnest(data3)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(file_contents)`
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$file,pattern="Allele_|_NetMHC_Results.csv",replacement = ""))
Netmhcpanres=Netmhcpanres %>% mutate(HLA_Allele = gsub(x=Netmhcpanres$HLA_Allele,pattern="LENGTH_[0-9]*_",replacement = ""))
# Map HLA nomenclature
Netmhcpanres$HLA_Allele = ClosestMatch2(Netmhcpanres$HLA_Allele,unique(FullDataset$HLA_Allele))

Netmhcpanres=Netmhcpanres %>% select(Peptide, "Thalf(h)",HLA_Allele, Rank) %>% dplyr::rename(StabRank=Rank)

FullDataset=FullDataset %>% inner_join(Netmhcpanres, by=c("Peptide"))
FullDataset %>% glimpse()
## Rows: 123
## Columns: 13
## $ Norm_peptide   <chr> ".........R", ".P.......", ".P........", ".P.......", "…
## $ Peptide        <chr> "FLEEIILKSL", "FLRESQNPL", "GLALGTPLSI", "GLAVNLSQI", "…
## $ HLA_Allele.x   <chr> "HLA-A02:01", "HLA-A02:01", "HLA-A02:01", "HLA-A02:01",…
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negative", "Negati…
## $ `1-log50k`     <dbl> 0.6756, 0.6498, 0.4990, 0.5065, 0.3949, 0.4457, 0.4416,…
## $ nM             <dbl> 33.4288, 44.2217, 226.0590, 208.4324, 696.8362, 402.439…
## $ Rank           <dbl> 0.4449, 0.5677, 1.7764, 1.6901, 3.3725, 2.4937, 2.5586,…
## $ Ave            <dbl> 0.6756, 0.6498, 0.4990, 0.5065, 0.3949, 0.4457, 0.4416,…
## $ NB             <int> 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1…
## $ Binder         <chr> "BINDER", "BINDER", "BINDER", "BINDER", "NONBINDER", "N…
## $ `Thalf(h)`     <dbl> 1.6064, 0.7949, 1.4697, 3.3911, 0.1720, 1.7038, 0.5679,…
## $ HLA_Allele.y   <chr> "HLA-A02:01", "HLA-A02:01", "HLA-A02:01", "HLA-A02:01",…
## $ StabRank       <dbl> 2.5, 5.5, 3.0, 1.1, 34.0, 2.5, 8.0, 3.5, 6.5, 3.5, 4.0,…

2.1.2 Compute fraction hydrophobicity

FullDataset=FullDataset%>% mutate(Length = nchar(Peptide)) %>% mutate(HydrophobicCount =  stri_count_regex(FullDataset$Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(HydroFraction = HydrophobicCount/Length)

TESLASubset=TESLA%>% mutate(Dataset="TESLA") %>% select(Peptide,Immunogenicity, nM, "Thalf(h)", HydroFraction, Dataset, Rank, StabRank)
GBMSubset=FullDataset%>% mutate(Dataset = "GBM")%>% select(Peptide,Immunogenicity, nM, "Thalf(h)", HydroFraction, Dataset, Rank, StabRank)

combinedDataset = rbind(TESLASubset,GBMSubset)
combinedDataset%>% glimpse()
## Rows: 731
## Columns: 8
## $ Peptide        <chr> "ASSFLKSFY", "ATLFSDSWYY", "CSDSGKSFINY", "CVDWLIAVY", …
## $ Immunogenicity <chr> "Negative", "Negative", "Negative", "Negative", "Negati…
## $ nM             <dbl> 132.1, 251.7, 64.8, 38.7, 24.0, 481.3, 22.7, 88.3, 29.1…
## $ `Thalf(h)`     <dbl> 0.7648, 2.2041, 2.2370, 2.2785, 1.2140, 0.5879, 1.8232,…
## $ HydroFraction  <dbl> 0.3333333, 0.3000000, 0.2727273, 0.6666667, 0.2222222, …
## $ Dataset        <chr> "TESLA", "TESLA", "TESLA", "TESLA", "TESLA", "TESLA", "…
## $ Rank           <dbl> 0.1363, 0.1680, 0.0289, 0.0315, 0.0233, 0.2913, 0.0148,…
## $ StabRank       <dbl> 0.700, 0.060, 0.060, 0.060, 0.300, 1.300, 0.100, 0.500,…

2.2 Results

2.2.1 Affinity

BA_GBMTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=nM, fill=Dataset))+geom_density(aes(y=..density..),alpha=0.4, bins=10,position = "identity")+facet_wrap(~Immunogenicity)+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Affinity\n(nM)")
## Warning: Ignoring unknown parameters: bins
mycomparisons =list(c("Positive","Negative"))

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(nM))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(nM))


BA_GBMTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="nM",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)


BA_GBMTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=nM, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(nM)")+theme_pubr(base_size = 16)



BA_GBMTESLA_ECDF_PLT=PathogenicSubset %>% select(nM, Immunogenicity, Peptide) %>% mutate(Dataset="Pathogenic")%>%rbind(
  TESLASubset %>% select(nM, Immunogenicity,Peptide)%>% mutate(Dataset = "Cancer")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=nM, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Affinity\n(nM)")
library(cowplot)

plot_grid(BA_GBMTESLA_DENSITY_PLT, BA_GBMTESLA_BOX_PLT,BA_GBMTESLA_VIOLIN_PLT+theme(legend.position = "none"),BA_GBMTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")

2.2.2 Binding affinity rank score

mycomparisons =list(c("Positive","Negative"))

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(Rank))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(Rank))


BA_GBMTESLA_RANK_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Rank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)


BA_GBMTESLA_RANK_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=Rank, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Affinity\n(Rank)")+theme_pubr(base_size = 16)

2.2.3 Binding stability in hrs

STAB_GBMTESLA_DENSITY_PLT=combinedDataset %>% ggplot(aes(x=`Thalf(h)`, fill=Dataset))+geom_density(aes(y=..density..),alpha=0.4, bins=10,position = "identity")+facet_wrap(~Immunogenicity)+theme_pubr(base_size = 16)+
    scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+xlab("Binding Stability\n(hrs)")
## Warning: Ignoring unknown parameters: bins
MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`Thalf(h)`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`Thalf(h)`))

STAB_GBMTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="Thalf(h)",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

STAB_GBMTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=`Thalf(h)`, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(hrs)")+theme_pubr(base_size = 16)

STAB_GBMTESLA_ECDF_PLT=GBMSubset %>% select(`Thalf(h)`, Immunogenicity, Peptide) %>% mutate(Dataset="GBM")%>%rbind(
  TESLASubset %>% select(`Thalf(h)`, Immunogenicity,Peptide)%>% mutate(Dataset = "TESLA")
)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=`Thalf(h)`, color=Dataset))+stat_ecdf(size=2)+scale_x_log10()+theme_pubr(base_size = 14)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Binding Stability\n(hrs)")

2.2.4 Binding stability in rank scores

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(`StabRank`))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(`StabRank`))

STAB_RANK_GBMTESLA_VIOLIN_PLT=combinedDataset %>% ggviolin(x="Immunogenicity",y="StabRank",add="boxplot")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

STAB_RANK_GBMTESLA_BOX_PLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=StabRank, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))+ylab("Binding Stability\n(Rank)")+theme_pubr(base_size = 16)
library(cowplot)

plot_grid(BA_GBMTESLA_RANK_BOX_PLT, BA_GBMTESLA_RANK_VIOLIN_PLT,STAB_GBMTESLA_BOX_PLT+theme(legend.position = "none"),STAB_GBMTESLA_VIOLIN_PLT,nrow=1,align="hv", rel_widths = c(0.8,1,0.8,1),axis="bl")

2.2.5 Fraction of hydrophobicity

MEDIAN_DATA_TESLA_NEG = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(HydroFraction))
MEDIAN_DATA_TESLA_POS = combinedDataset %>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(HydroFraction))

GBMTESL_FRACTIONHYDRO_BXPLT_COMPARE_CANCER=combinedDataset %>% ggboxplot(x="Immunogenicity",y="HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

GBMTESL_FRACTIONHYDRO_BXPLT=combinedDataset %>% ggplot(aes(x=Immunogenicity, y=HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("Fraction Hydrophobic")+theme_pubr(base_size = 16)
plot_grid(STAB_RANK_GBMTESLA_BOX_PLT+theme(legend.position = "none"), STAB_RANK_GBMTESLA_VIOLIN_PLT,GBMTESL_FRACTIONHYDRO_BXPLT+theme(legend.position = "none"),GBMTESL_FRACTIONHYDRO_BXPLT_COMPARE_CANCER,nrow=1,align="hv", rel_widths = c(0.8,1,0.8,1),axis="bl")

2.2.6 Examine fraction hydrophobicity in TCR contact residues

  • Reviewer’s comment : “The authors should check the hydrophobicity of amino acids in the non-anchor regions as well, because anchors determine HLA binding only, while the T cell response is mainly associated with non-anchor amino acids.”
  • To reduce confounding factors of this analysis, we only examine 9 and 10mers
  • We adopt the approach of Koncz et al, 2021 PNAS:
  • TCR contact residues for 9mers and positions 4-8
  • TCR contact residues for 10mers are positions 5-9
Hydro_frac_tcr_contact_dt_9 = combinedDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length %in% c(9))
Hydro_frac_tcr_contact_dt_10 = combinedDataset %>% mutate(Length = nchar(Peptide)) %>% filter(Length %in% c(10))

Hydro_frac_tcr_contact_dt_9=Hydro_frac_tcr_contact_dt_9%>% mutate(TCRContact_Peptide = substr(Peptide, start=4, stop=8))
Hydro_frac_tcr_contact_dt_10=Hydro_frac_tcr_contact_dt_10%>% mutate(TCRContact_Peptide = substr(Peptide, start=5, stop=9))
Hydro_frac_tcr_contact_dt = rbind(Hydro_frac_tcr_contact_dt_9,Hydro_frac_tcr_contact_dt_10)
Hydro_frac_tcr_contact_dt %>% select(Dataset, Immunogenicity) %>% table
##        Immunogenicity
## Dataset Negative Positive
##   GBM         98       25
##   TESLA      487       35
# compute hydro fraction
Hydro_frac_tcr_contact_dt=Hydro_frac_tcr_contact_dt %>% mutate(TCRContact_Length = nchar(TCRContact_Peptide))

Hydro_frac_tcr_contact_dt=Hydro_frac_tcr_contact_dt %>% mutate(HydrophobicCount =  stri_count_regex(Hydro_frac_tcr_contact_dt$TCRContact_Peptide, paste0(HYDROPHOBIC_RESIDUES,collapse = "|"))) %>% mutate(TCR_HydroFraction = HydrophobicCount/TCRContact_Length)
TCR_HYDRO_PATHTESLA_DENSITY_PLT= Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% ggplot(aes(x=TCR_HydroFraction, fill=Dataset))+facet_wrap(~Immunogenicity)+geom_density(aes(y=..density..),alpha=0.4,position = "identity")+theme_pubr(base_size = 16)+xlab("TCR Contact Fraction Hydrophobic")

MEDIAN_DATA_TESLA_NEG = Hydro_frac_tcr_contact_dt%>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct() %>% filter(Dataset == 'TESLA', Immunogenicity == "Negative") %>% dplyr::summarise(median = median(TCR_HydroFraction))
MEDIAN_DATA_TESLA_POS = Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% filter(Dataset == 'TESLA', Immunogenicity == "Positive") %>% dplyr::summarise(median = median(TCR_HydroFraction))

TCR_HYDRO_PATHTESLA_VIOLIN_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggboxplot(x="Immunogenicity",y="TCR_HydroFraction")+theme_pubr(base_size = 16)+facet_grid(~Dataset)+stat_compare_means(label = "p.signif",label.x.npc = "center",comparisons = mycomparisons)+ylab("TCR Contact Fraction Hydrophobic")+ geom_hline(data=MEDIAN_DATA_TESLA_NEG,aes(yintercept=median), linetype="dashed", color = "red", size=0.5)+ geom_hline(data=MEDIAN_DATA_TESLA_POS,aes(yintercept=median), linetype="dashed", color = "green", size=0.5)

TCR_HYDRO_PATHTESLA_BOX_PLT=Hydro_frac_tcr_contact_dt %>% select(Peptide,Immunogenicity,Dataset,TCR_HydroFraction) %>% distinct()%>% ggplot(aes(x=Immunogenicity, y=TCR_HydroFraction, fill=Dataset))+
    geom_boxplot(alpha=0.3)+stat_compare_means(label = "p.signif",label.x.npc = "center")+ylab("TCR Contact Fraction Hydrophobic")+theme_pubr(base_size = 16)

TCR_HYDRO_PATHTESLA_ECDF_PLT=Hydro_frac_tcr_contact_dt %>% select(TCR_HydroFraction, Immunogenicity, Peptide, Dataset)%>% mutate(Dataset = paste0(Dataset,"_",Immunogenicity))%>% ggplot(aes(x=TCR_HydroFraction, color=Dataset))+stat_ecdf(size=2)+theme_pubr(base_size = 16)+grids()+ guides(color = guide_legend(nrow = 2))+ylab("Cumulative Freq. of Peptides")+xlab("Fraction Hydrophobic")
#plot_grid(TCR_HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), TCR_HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),TCR_HYDRO_PATHTESLA_VIOLIN_PLT, TCR_HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),#axis="bl")

plot_grid(TCR_HYDRO_PATHTESLA_DENSITY_PLT+rotate_x_text(angle=90), TCR_HYDRO_PATHTESLA_BOX_PLT+theme(legend.position = "none"),TCR_HYDRO_PATHTESLA_VIOLIN_PLT,TCR_HYDRO_PATHTESLA_ECDF_PLT,nrow=1,align="hv", rel_widths = c(1,0.75,1.05,1.2),axis="bl")